# Suppress all warnings for the whole session.
# NOTE(review): a global warn=-1 hides real problems — consider re-enabling.
options(warn=-1)
# pacman::p_load() installs (when missing) and attaches packages.
library(pacman)
p_load(dplyr)
# Project-local import helper used to load the common libraries below.
source('../lib/import.R')
#
# Library of common functions developed as part of this assignment (TP).
import('../lib/common-lib.R')
## [1] "-> './reflection.R' script loadded successfuly!"
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> './csv.R' script loadded successfuly!"
## Installing package into '/usr/local/lib/R/site-library'
## (as 'lib' is unspecified)
## 
## tidyverse installed
## Installing package into '/usr/local/lib/R/site-library'
## (as 'lib' is unspecified)
## 
## tidyverse installed
## Installing package into '/usr/local/lib/R/site-library'
## (as 'lib' is unspecified)
## 
## tidyverse installed
## [1] "-> './plot.R' script loadded successfuly!"
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> './importance.R' script loadded successfuly!"
## [1] "-> './correlation.R' script loadded successfuly!"
## [1] "-> './pca.R' script loadded successfuly!"
## [1] "-> './set_split.R' script loadded successfuly!"
## [1] "-> './models.R' script loadded successfuly!"
## ---
## biotools version 4.1
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> './test.R' script loadded successfuly!"
## [1] "-> './metrics.R' script loadded successfuly!"
## [1] "-> './balance.R' script loadded successfuly!"
## [1] "-> '../lib/common-lib.R' script loadded successfuly!"
#
# Functions specific to this assignment (TP).
import('../scripts/helper_functions.R')
## [1] "-> '../lib/pca.R' script loadded successfuly!"
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> '../lib/importance.R' script loadded successfuly!"
## [1] "-> '../scripts/helper_functions.R' script loadded successfuly!"
set.seed(1)

1. Cargamos el dataset

1.1. Excluimos las columnas no numéricas que no nos interesan para este análisis.

# Non-numeric / irrelevant columns to drop before the analysis, grouped by
# reason for exclusion. (Order is irrelevant: the vector is only used to
# exclude columns.)
excluded_columns <- c(
  # Identifiers, dates and constant metadata
  "Neo.Reference.ID", "Name", "Close.Approach.Date",
  "Epoch.Date.Close.Approach", "Orbit.ID", "Orbit.Determination.Date",
  "Orbiting.Body", "Equinox", "Orbit.Uncertainity",
  # Diameter estimates duplicated in other units (the miles columns are kept)
  "Est.Dia.in.Feet.min.", "Est.Dia.in.Feet.max.",
  "Est.Dia.in.M.min.", "Est.Dia.in.M.max.",
  "Est.Dia.in.KM.min.", "Est.Dia.in.KM.max."
)

1.2. Cargamos el dataset nasa-asteroids y

excluimos observaciones con valores faltantes.

# Load the NASA asteroids dataset, drop the unwanted columns and remove rows
# with missing values. all_of() makes the external-vector column selection
# explicit and silences tidyselect's ambiguity note the original triggered.
ds_step_1 <- loadcsv('../datasets/nasa.csv') %>% 
  dplyr::select(-all_of(excluded_columns)) %>%
  na.omit
# Inspect the resulting structure (4687 obs. of 25 variables expected).
str(ds_step_1)
## 'data.frame':    4687 obs. of  25 variables:
##  $ Absolute.Magnitude          : num  21.6 21.3 20.3 27.4 21.6 19.6 19.6 19.2 17.8 21.5 ...
##  $ Est.Dia.in.Miles.min.       : num  0.07905 0.09076 0.14385 0.00547 0.07905 ...
##  $ Est.Dia.in.Miles.max.       : num  0.1768 0.203 0.3217 0.0122 0.1768 ...
##  $ Relative.Velocity.km.per.sec: num  6.12 18.11 7.59 11.17 9.84 ...
##  $ Relative.Velocity.km.per.hr : num  22017 65210 27327 40226 35427 ...
##  $ Miles.per.hour              : num  13681 40519 16980 24995 22013 ...
##  $ Miss.Dist..Astronomical.    : num  0.419 0.383 0.051 0.285 0.408 ...
##  $ Miss.Dist..lunar.           : num  163.2 149 19.8 111 158.6 ...
##  $ Miss.Dist..kilometers.      : num  62753692 57298148 7622912 42683616 61010824 ...
##  $ Miss.Dist..miles.           : num  38993336 35603420 4736658 26522368 37910368 ...
##  $ Minimum.Orbit.Intersection  : num  0.02528 0.18693 0.04306 0.00551 0.0348 ...
##  $ Jupiter.Tisserand.Invariant : num  4.63 5.46 4.56 5.09 5.15 ...
##  $ Epoch.Osculation            : num  2458000 2458000 2458000 2458000 2458000 ...
##  $ Eccentricity                : num  0.426 0.352 0.348 0.217 0.21 ...
##  $ Semi.Major.Axis             : num  1.41 1.11 1.46 1.26 1.23 ...
##  $ Inclination                 : num  6.03 28.41 4.24 7.91 16.79 ...
##  $ Asc.Node.Longitude          : num  314.4 136.7 259.5 57.2 84.6 ...
##  $ Orbital.Period              : num  610 426 644 514 496 ...
##  $ Perihelion.Distance         : num  0.808 0.718 0.951 0.984 0.968 ...
##  $ Perihelion.Arg              : num  57.3 313.1 248.4 18.7 158.3 ...
##  $ Aphelion.Dist               : num  2.01 1.5 1.97 1.53 1.48 ...
##  $ Perihelion.Time             : num  2458162 2457795 2458120 2457902 2457814 ...
##  $ Mean.Anomaly                : num  264.8 173.7 292.9 68.7 135.1 ...
##  $ Mean.Motion                 : num  0.591 0.845 0.559 0.7 0.726 ...
##  $ Hazardous                   : chr  "True" "False" "True" "False" ...

2. Eliminamos las columnas que están altamente correlacionadas

Excluimos las columnas que tienen una correlación mayor al 80%.

# Identify columns with pairwise correlation above the 0.8 cutoff
# (project helper; feat() presumably drops the target column — TODO confirm).
high_correlated_columns <- find_high_correlated_columns(
  feat(ds_step_1), 
  cutoff=0.8
)
length(high_correlated_columns)
## [1] 11
# length() on a data.frame returns its number of columns.
length(ds_step_1)
## [1] 25
# Drop the highly-correlated columns. all_of() makes the external-vector
# selection explicit (avoids tidyselect's ambiguous-lookup note).
# NOTE(review): [-1] keeps the first flagged column — confirm that is
# intentional (25 - 10 = 15 columns remain, matching the recorded output).
ds_step_2 <- ds_step_1 %>% dplyr::select(-all_of(high_correlated_columns[-1]))
rm(ds_step_1)

length(ds_step_2)
## [1] 15

3. Tomamos las columnas que mejor separan las clases.

Para realizar este paso vamos a utilizar la función features_importance basada en el algoritmo Random Forest. Esta función nos permite comparar las variables de acuerdo a cuán buenas son para separar las clases. Luego tomaremos las 5 variables que mejor separan las clases.

# Rank features by class-separation power using Random Forest importance
# (project helper wrapping randomForest).
result <- features_importance(ds_step_2, target = 'Hazardous')
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(target)` instead of `target` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
# Print the fitted forest summary (OOB error rate and confusion matrix).
result
## 
## Call:
##  randomForest(x = features, y = target, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 0.34%
## Confusion matrix:
##       False True class.error
## False  3925    7 0.001780264
## True      9  746 0.011920530
# Visualize the importance ranking (project helper).
plot_features_importance(result)

# Number of top features to keep for the models below.
n_best_features = 5
# n_best_features = 15

# Take the n most important features (selected by MeanDecreaseGini).
best_features <- top_acc_features(result, top=n_best_features)
## Selecting by MeanDecreaseGini
best_features
## [1] "Minimum.Orbit.Intersection" "Absolute.Magnitude"        
## [3] "Est.Dia.in.Miles.min."      "Perihelion.Distance"       
## [5] "Inclination"
length(best_features)
## [1] 5
# Keep the selected best features plus the target column. all_of() makes the
# external-vector selection explicit, which also silences tidyselect's
# ambiguity note the original call triggered.
ds_step_3 <- ds_step_2 %>% dplyr::select(all_of(best_features), Hazardous)
# rm(ds_step_2)
str(ds_step_3)
## 'data.frame':    4687 obs. of  6 variables:
##  $ Minimum.Orbit.Intersection: num  0.02528 0.18693 0.04306 0.00551 0.0348 ...
##  $ Absolute.Magnitude        : num  21.6 21.3 20.3 27.4 21.6 19.6 19.6 19.2 17.8 21.5 ...
##  $ Est.Dia.in.Miles.min.     : num  0.07905 0.09076 0.14385 0.00547 0.07905 ...
##  $ Perihelion.Distance       : num  0.808 0.718 0.951 0.984 0.968 ...
##  $ Inclination               : num  6.03 28.41 4.24 7.91 16.79 ...
##  $ Hazardous                 : chr  "True" "False" "True" "False" ...

Ahora comparemos el resultado de feature_importance con PCA.

plot_pca_original_variables(feat(ds_step_3))

## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 4687 individuals, described by 5 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"

4. Transformamos las clases a números

# Encode the target numerically: 1 when Hazardous is 'True', 0 for anything
# else (same fallback the original case_when's TRUE ~ 0 branch provided).
ds_step_4 <- ds_step_3 %>% 
  mutate(Hazardous = as.numeric(Hazardous %in% c('True')))

str(ds_step_4)
## 'data.frame':    4687 obs. of  6 variables:
##  $ Minimum.Orbit.Intersection: num  0.02528 0.18693 0.04306 0.00551 0.0348 ...
##  $ Absolute.Magnitude        : num  21.6 21.3 20.3 27.4 21.6 19.6 19.6 19.2 17.8 21.5 ...
##  $ Est.Dia.in.Miles.min.     : num  0.07905 0.09076 0.14385 0.00547 0.07905 ...
##  $ Perihelion.Distance       : num  0.808 0.718 0.951 0.984 0.968 ...
##  $ Inclination               : num  6.03 28.41 4.24 7.91 16.79 ...
##  $ Hazardous                 : num  1 0 1 0 1 0 0 0 0 1 ...

5. Análisis Exploratorio

5.1. Boxplot comparativos

coparative_boxplot(feat(ds_step_4), to_col=n_best_features)

5.2. Histogramas y densidad

comparative_histplot(feat(ds_step_4), to_col=n_best_features)

5.3. Analizamos gráfico de normalidad univariado

## [[1]]
## [1] 3.7e-24
## 
## [[2]]
## [1] 3.7e-24
## 
## [[3]]
## [1] 3.7e-24
## 
## [[4]]
## [1] 3.7e-24
## 
## [[5]]
## [1] 3.7e-24

Observaciones: Al parecer, una sola variable parece ser normal.

5.3. Test de normalidad uni-variado

uni_shapiro_test(feat(ds_step_4))
## [1] "=> Variable: Minimum.Orbit.Intersection"
## [1] ""
## 
##  Shapiro-Wilk normality test
## 
## data:  features[, col_index]
## W = 0.82145, p-value < 2.2e-16
## 
## [1] "=> Variable: Absolute.Magnitude"
## [1] ""
## 
##  Shapiro-Wilk normality test
## 
## data:  features[, col_index]
## W = 0.9888, p-value < 2.2e-16
## 
## [1] "=> Variable: Est.Dia.in.Miles.min."
## [1] ""
## 
##  Shapiro-Wilk normality test
## 
## data:  features[, col_index]
## W = 0.41945, p-value < 2.2e-16
## 
## [1] "=> Variable: Perihelion.Distance"
## [1] ""
## 
##  Shapiro-Wilk normality test
## 
## data:  features[, col_index]
## W = 0.98627, p-value < 2.2e-16
## 
## [1] "=> Variable: Inclination"
## [1] ""
## 
##  Shapiro-Wilk normality test
## 
## data:  features[, col_index]
## W = 0.88555, p-value < 2.2e-16

Observaciones: En todos los casos el p-valor < 0.05 y se rechaza normalidad en todas las variables. Esto coincide con los qqplot, donde en todos los casos no son normales salvo en una única variable que parece tender a la normalidad.

5.4. Test de normalidad multi-variado

mult_shapiro_test(feat(ds_step_4))
## 
##  Shapiro-Wilk normality test
## 
## data:  Z
## W = 0.29729, p-value < 2.2e-16

Observaciones: El p-valor < 0.05, por lo tanto se rechaza normalidad multi-variada. Se corresponde con el resultado del test de Shapiro uni-variado pero no con el qqplot de cada variable. Entiendo que todas las variables se acercan a una normal, pero no lo son completamente.

5.5. Test de homocedasticidad uni-variado

uni_levene_test(feat(ds_step_4), ds_step_4$Hazardous)
## [1] "=> Variable: Minimum.Orbit.Intersection"
## [1] ""
## Levene's Test for Homogeneity of Variance (center = median)
##         Df F value    Pr(>F)    
## group    1  576.34 < 2.2e-16 ***
##       4685                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""
## [1] "=> Variable: Absolute.Magnitude"
## [1] ""
## Levene's Test for Homogeneity of Variance (center = median)
##         Df F value    Pr(>F)    
## group    1  500.87 < 2.2e-16 ***
##       4685                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""
## [1] "=> Variable: Est.Dia.in.Miles.min."
## [1] ""
## Levene's Test for Homogeneity of Variance (center = median)
##         Df F value Pr(>F)
## group    1  0.3586 0.5493
##       4685               
## [1] ""
## [1] ""
## [1] "=> Variable: Perihelion.Distance"
## [1] ""
## Levene's Test for Homogeneity of Variance (center = median)
##         Df F value    Pr(>F)    
## group    1  16.241 5.667e-05 ***
##       4685                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""
## [1] "=> Variable: Inclination"
## [1] ""
## Levene's Test for Homogeneity of Variance (center = median)
##         Df F value Pr(>F)  
## group    1   3.099 0.0784 .
##       4685                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""

Observaciones

  • Minimum.Orbit.Intersection: p-valor < 0.05 -> Rechaza homocedasticidad.
  • Absolute.Magnitude: p-valor < 0.05 -> Rechaza homocedasticidad.
  • Est.Dia.in.Miles.min.: p-valor > 0.05 -> No Rechaza homocedasticidad.
  • Perihelion.Distance: p-valor < 0.05 -> Rechaza homocedasticidad.
  • Inclination: p-valor > 0.05 -> No Rechaza homocedasticidad.

5.6. Test de homocedasticidad multi-variado

multi_boxm_test(feat(ds_step_4), ds_step_4$Hazardous)
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  features
## Chi-Sq (approx.) = 3154.8, df = 15, p-value < 2.2e-16

Observaciones: El p-valor < 0.05, por lo tanto se rechaza la hipótesis nula y podemos decir que las variables no son homocedásticas.

5.7. Correlaciones entre variables

plot_correlations(feat(ds_step_4))

5.8. Análisis completo

5.9. PCA: Comparación de variables originales con/sin la variable a predecir.

## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 4687 individuals, described by 6 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"

## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 4687 individuals, described by 5 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"

5.10. PCA: Con observaciones discriminadas por clase. Primero quitamos

outliers para poder ver con mas claridad el biplot.

# Drop extreme observations so the biplot is readable (project helper;
# max_score is presumably an outlier-score cutoff — TODO confirm).
ds_without_outliers <- filter_outliers(ds_step_4, max_score=0.52)

# Robust-PCA biplot of the remaining observations (project helper).
plot_robust_pca(ds_without_outliers)

6. Train test split

c(raw_train_set, raw_test_set) %<-% train_test_split(ds_step_4, train_size=.8)
## [1] "Train Set size: 3749"
## [1] "Test set size: 938"

7. Método SMOTE para balancear el dataset.

# SMOTE balancing is currently disabled; the raw training set is used as-is.
# balanced_train_set <- smote_balance(raw_train_set, raw_train_set$Hazardous)
#rm(raw_train_set)
balanced_train_set <- raw_train_set

8. Escalamos las variables numéricas (restamos la media y dividimos por el

desvío).

# Standardize the numeric features (z-score: subtract mean, divide by sd).
# across() replaces the superseded mutate_at()/vars() idiom.
# NOTE(review): the test set is scaled with its OWN mean/sd rather than the
# training set's statistics — confirm this is intended (mild leakage risk).
train_set <- balanced_train_set %>% 
  mutate(across(-Hazardous, ~ as.vector(scale(.))))
test_set  <- raw_test_set %>% 
  mutate(across(-Hazardous, ~ as.vector(scale(.))))

# Free the intermediate copies.
rm(balanced_train_set)
rm(raw_test_set)

9. Entrenamos un modelo LDA

# Model formula shared by every classifier below: predict Hazardous from all
# remaining features.
reg_formula    <- formula(Hazardous~.)
# Linear discriminant analysis on the scaled training set.
lda_model      <- lda(reg_formula, train_set)
lda_test_pred  <- predict(lda_model, test_set)

# Confusion matrix of predicted vs. real classes (project helper).
plot_cm(lda_test_pred$class, test_set$Hazardous)

graphics.off()
# ROC curve. NOTE(review): built from hard class labels, not posterior
# probabilities, so the curve only has one non-trivial operating point.
plot_roc(lda_test_pred$class, test_set$Hazardous)
##                                  
##  Method used: empirical          
##  ===== Positive(s) =====         
##  Number of positive(s): 154      
##  Mean of positive(s): 1.656      
##  Variance of positive(s): 0.2272 
##  ===== Negative(s) =====         
##  Number of negative(s): 784      
##  Mean of negative(s): 1.056      
##  Variance of negative(s): 0.05304
##  ===== AUC =====                 
##  Area under curve: 0.7999        
##                                  
##  =========================       
##      FPR    TPR
##  0.00000 0.0000
##  0.05612 0.6558
##  1.00000 1.0000
# F-beta with beta=2 weighs recall twice as much as precision (project helper).
f_beta_score(lda_test_pred$class, test_set$Hazardous, beta=2)
## [1] "F2Score: 0.287720926667887"
# Coefficients of the single linear discriminant (two classes -> one LD).
lda_model$scaling
##                                    LD1
## Minimum.Orbit.Intersection -1.25718073
## Absolute.Magnitude         -1.59924349
## Est.Dia.in.Miles.min.      -0.41638970
## Perihelion.Distance         0.06717141
## Inclination                -0.01172453

10. Entrenamos un modelo QDA

# Quadratic discriminant analysis: like LDA but allows a per-class
# covariance matrix.
qda_model      <- qda(reg_formula, train_set)
qda_test_pred  <- predict(qda_model, test_set)

# Confusion matrix and label-based ROC (project helpers).
plot_cm(qda_test_pred$class, test_set$Hazardous)

plot_roc(qda_test_pred$class, test_set$Hazardous)

##                                   
##  Method used: empirical           
##  ===== Positive(s) =====          
##  Number of positive(s): 154       
##  Mean of positive(s): 1.903       
##  Variance of positive(s): 0.08849 
##  ===== Negative(s) =====          
##  Number of negative(s): 784       
##  Mean of negative(s): 1.005       
##  Variance of negative(s): 0.005082
##  ===== AUC =====                  
##  Area under curve: 0.9487         
##                                   
##  =========================        
##       FPR    TPR
##  0.000000 0.0000
##  0.005102 0.9026
##  1.000000 1.0000
f_beta_score(qda_test_pred$class, test_set$Hazardous, beta=2)
## [1] "F2Score: 0.0907401672344379"

11. Entrenamos un modelo RDA

# Regularized discriminant analysis (interpolates between LDA and QDA).
rda_model      <- rda(reg_formula, train_set)
rda_test_pred  <- predict(rda_model, test_set)

# Confusion matrix and label-based ROC (project helpers).
plot_cm(rda_test_pred$class, test_set$Hazardous)

plot_roc(rda_test_pred$class, test_set$Hazardous)

##                                   
##  Method used: empirical           
##  ===== Positive(s) =====          
##  Number of positive(s): 154       
##  Mean of positive(s): 1.903       
##  Variance of positive(s): 0.08849 
##  ===== Negative(s) =====          
##  Number of negative(s): 784       
##  Mean of negative(s): 1.005       
##  Variance of negative(s): 0.005082
##  ===== AUC =====                  
##  Area under curve: 0.9487         
##                                   
##  =========================        
##       FPR    TPR
##  0.000000 0.0000
##  0.005102 0.9026
##  1.000000 1.0000
f_beta_score(rda_test_pred$class, test_set$Hazardous, beta=2)
## [1] "F2Score: 0.0907401672344379"

12. Entrenamos un modelo de regresión logística

# Logistic regression.
# Bug fix: predict() on a glm returns the LINEAR PREDICTOR (log-odds) by
# default; type = "response" returns probabilities, so the 0.4 threshold
# below is applied on the probability scale as the text intends.
rl_model     <- glm(reg_formula, train_set, family=binomial)
rl_test_pred <- predict(rl_model, test_set, type = "response")

# Classify as hazardous when the predicted probability reaches the cutoff.
rl_threshold <- 0.4
rl_test_pred_threshold <- ifelse(rl_test_pred >= rl_threshold, 1, 0)

# Confusion matrix and ROC of the thresholded predictions (project helpers).
plot_cm(rl_test_pred_threshold, test_set$Hazardous)

plot_roc(rl_test_pred_threshold, test_set$Hazardous)

##                                  
##  Method used: empirical          
##  ===== Positive(s) =====         
##  Number of positive(s): 154      
##  Mean of positive(s): 0.7273     
##  Variance of positive(s): 0.1996 
##  ===== Negative(s) =====         
##  Number of negative(s): 784      
##  Mean of negative(s): 0.01913    
##  Variance of negative(s): 0.01879
##  ===== AUC =====                 
##  Area under curve: 0.8541        
##                                  
##  =========================       
##      FPR    TPR
##  0.00000 0.0000
##  0.01913 0.7273
##  1.00000 1.0000
f_beta_score(rl_test_pred_threshold, test_set$Hazardous, beta=2)
## [1] "F2Score: 0.753701211305518"

13. Entrenamos un modelo SVM

# SVM with a radial (RBF) kernel on the scaled training set.
svm_model     <- svm(reg_formula, train_set, kernel="radial")

svm_test_pred <- predict(svm_model, test_set)
svm_threshold <- 0.5
# Bug fix: the original compared against rl_threshold (the logistic-regression
# cutoff, 0.4) instead of the SVM's own threshold declared just above.
svm_test_pred_threshold <- ifelse(svm_test_pred >= svm_threshold, 1, 0)

# Text confusion matrix and ROC for the thresholded SVM predictions
# (project helpers).
plot_text_cm(svm_test_pred_threshold, test_set$Hazardous)
##        Prediction
## Reality   0   1
##       0 762  22
##       1  22 132
plot_roc(svm_test_pred_threshold, test_set$Hazardous)

##                                  
##  Method used: empirical          
##  ===== Positive(s) =====         
##  Number of positive(s): 154      
##  Mean of positive(s): 0.8571     
##  Variance of positive(s): 0.1232 
##  ===== Negative(s) =====         
##  Number of negative(s): 784      
##  Mean of negative(s): 0.02806    
##  Variance of negative(s): 0.02731
##  ===== AUC =====                 
##  Area under curve: 0.9145        
##                                  
##  =========================       
##      FPR    TPR
##  0.00000 0.0000
##  0.02806 0.8571
##  1.00000 1.0000
f_beta_score(svm_test_pred_threshold, test_set$Hazardous, beta=2)
## [1] "F2Score: 0.857142857142857"

14. KMeans Clustering

14.1. Primero definimos cuantos grupos utilizar.

Escalamiento típico con estimadores de la normal (media y desvío).

# Standard (z-score) scaling of the feature columns.
# Bug fix: the original passed a bare formula to mutate(), which does not
# apply the scaling function to the columns; across(everything(), ...)
# applies it to every remaining column.
scaled_data_1 <- ds_step_4 %>%
  dplyr::select(-Hazardous) %>%
  mutate(across(everything(), ~(scale(.) %>% as.vector)))

Escalamiento diferente de la típica normal.

# Min-max scaling to [0, 1]; apply() over a data.frame coerces to a numeric
# matrix (fine here: every column is numeric) and returns a matrix.
scaled_data_2 <- apply(
  ds_step_4 %>% dplyr::select(-Hazardous), 
  2, 
  function(x) { (x - min(x)) / (max(x) - min(x))}
)
# Cluster-count diagnostic plots (project helper), one per scaling strategy.
clustering_metrics_plot(scaled_data_1)

clustering_metrics_plot(scaled_data_2)

14.2. Kmeans

# K-means with k = 2 on the z-scored features.
n_clusters <- 2
kmeans_model <- kmeans(scaled_data_1, n_clusters)

# Attach the cluster assignment to a copy of the dataset.
km_ds_step_4 <- data.frame(ds_step_4)
km_ds_step_4$kmeans <- kmeans_model$cluster

14.2. biplot

# Drop extreme observations so the biplot is readable (project helper).
ds_without_outliers <- filter_outliers(km_ds_step_4, max_score=0.5)

# Robust-PCA biplot colored by k-means cluster.
# Bug fix: removed the trailing comma after `groups = ...`, which passes an
# empty (missing) argument and errors when the callee evaluates it.
plot_robust_pca(
  ds_without_outliers %>% dplyr::select(-kmeans),
  groups = factor(ds_without_outliers$kmeans)
)

15. Hierarchical Clustering

Matriz de distancias euclídeas

mat_dist <- dist(x = ds_step_4, method = "euclidean") 

Dendrogramas (según el tipo de segmentación jerárquica aplicada)

# Hierarchical clusterings with four linkage strategies, for comparison.
hc_complete <- hclust(d = mat_dist, method = "complete") 
hc_average  <- hclust(d = mat_dist, method = "average")
hc_single   <- hclust(d = mat_dist, method = "single")
hc_ward     <- hclust(d = mat_dist, method = "ward.D2")

Calculo del coeficiente de correlación cofenetico

# Cophenetic correlation: how faithfully each dendrogram preserves the
# original pairwise distances (closer to 1 is better).
cor(x = mat_dist, cophenetic(hc_complete))
## [1] 0.8024213
cor(x = mat_dist, cophenetic(hc_average))
## [1] 0.8064622
cor(x = mat_dist, cophenetic(hc_single))
## [1] 0.5078482
cor(x = mat_dist, cophenetic(hc_ward))
## [1] 0.5889577

Construcción de un dendograma usando los resultados de la técnica de Ward

# Number of groups to cut the Ward dendrogram into.
n_clusters <- 6

# Dendrogram with the chosen clusters framed in red.
graphics.off()
plot(hc_ward)
rect.hclust(hc_ward, k = n_clusters, border = "red")

# Attach the Ward cluster labels, drop extreme observations for readability,
# and show the groups on a robust-PCA biplot.
hc_ds <- data.frame(ds_step_4)
hc_ds$her_ward <- cutree(hc_ward, k = n_clusters)
ds_without_outliers <- filter_outliers(hc_ds, max_score = 0.53)

plot_robust_pca(
  ds_without_outliers %>% dplyr::select(-her_ward),
  groups = factor(ds_without_outliers$her_ward),
  colours=c("orange","cyan","blue","magenta","yellow","black"),
  labels=c("grupo 1", "grupo 2","grupo 3","grupo 4","grupo 5","grupo 6")
)

 

Realizado por Adrian Marino

adrianmarino@gmail.com